path <- "/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/popgen_analyses/clustering_PCA/admixture/MaciRef_vcfAllfRef"
dt <- read.delim("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/messor_contig_AllGeneticData.txt") %>%
    rename(Ind = sra_access_number) %>%
    mutate(lineage = case_when(mitochondrial %in% "" ~ species, TRUE ~ mitochondrial))
linCol <- c(brewer.pal(3, "Greens")[-2], brewer.pal(8, "Blues")[c(2, 5, 7)], brewer.pal(9,
    "Reds")[c(2, 3, 5, 7, 8, 9)], "black", "#6350B5", "#BF88C2", "#C7D144", "#6DCEB6",
    "#CA4AC3", "#D4953D", "#7735D8", "black")
names(linCol) <- c("mMbar1", "mMbar2", "mMebe1", "mMebe3", "mMebe2", "structor3",
    "ponticus2", "mcarthuri7", "muticus6", "structor4", "ibericus1", "aciculatus",
    "arenarius", "capitatus", "decipiens", "minor", "nodentatus", "oertzeni", "wasmanni",
    "sp.")

spCol <- c("gray80", "black", "#6350B5", brewer.pal(3, "Greens")[2], "#BF88C2", "#C7D144",
    brewer.pal(8, "Blues")[6], "#6DCEB6", "#CA4AC3", "#D4953D", "black", brewer.pal(9,
        "Reds")[6], "#7735D8")

names(spCol) <- dt %>%
    arrange(species) %>%
    pull(species) %>%
    unique() %>%
    c("0", .)

bestK <- function(pathAnalysis = pathAanalysis) {

    cvdt <- list.files(paste0(pathAnalysis, "/stdout"), pattern = "stdout", recursive = T,
        full.names = T)

    cvsOut <- function(cv) {
        cvd <- read.delim(cv)
        cvs <- data.frame(run = word(cv, -2, sep = fixed(".")), cvError = cvd[str_detect(cvd[,
            1], "CV error"), ])
        return(cvs)
    }

    bestK <- map_dfr(cvdt, cvsOut) %>%
        mutate(cvError = as.numeric(word(cvError, sep = ": ", 2, 2))) %>%
        separate(run, into = c("K", "rep"), sep = "_") %>%
        group_by(K) %>%
        summarize(meanCVerror = mean(cvError)) %>%
        filter(meanCVerror == min(meanCVerror)) %>%
        pull(K)
    return(bestK)
}

admixturePlot <- function(path = path, analysis = analysis, labelSize = labelSize) {
    samples <- read.table(list.files(paste0(path, "/", analysis), pattern = "_inds.txt",
        recursive = F, full.names = T))$V1
    lst <- list.files(paste0(path, "/", analysis, "/CLUMPAK"), pattern = "ClumppIndFile.output",
        recursive = T, full.names = T)

    lst <- lst[str_detect(lst, paste0("K=", bestK(paste(path, analysis, sep = "/")))) &
        str_detect(lst, "MajorCluster")]

    dtset <- dt %>%
        filter(Ind %in% samples) %>%
        arrange(Ind %in% samples) %>%
        mutate(order = paste0("V", 1:nrow(.)), ID = paste0(substring(caste, 1, 1),
            "_", Ind)) %>%
        rename(pop = lineage) %>%
        select(ID, Ind, pop, order)

    outfile <- read.table(lst, header = FALSE) %>%
        select(-(V1:V5))
    outfile$Ind <- samples

    setLabel <- gsub("/", "_", gsub("/CLUMPP.files/ClumppIndFile.output", "", unlist(str_split(lst,
        pattern = "CLUMPAK/"))[2]))

    colnames(outfile)[1:(ncol(outfile) - 1)] <- paste0("P", 1:(ncol(outfile) - 1))

    dtCluster <- outfile %>%
        left_join(dtset, by = "Ind") %>%
        pivot_longer(-c(order, ID, pop, Ind), names_to = "P") %>%
        # mutate(ch = word(ID, sep = '_', 1,1)) %>%
    arrange(desc(ID))

    p1 = ggplot(data = dtCluster, aes(y = fct_inorder(ID), x = value, fill = P, group = pop)) +
        geom_bar(stat = "identity", show.legend = FALSE, colour = "black", width = 1,
            size = 0.1) + scale_fill_iwanthue() + facet_grid(pop ~ ., scales = "free",
        space = "free") + theme_bw() + theme(plot.background = element_blank(), panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(), panel.border = element_blank(), panel.spacing = unit(0,
            "lines"), strip.background = element_rect(fill = "gray95"), strip.text.y = element_text(size = labelSize,
            angle = 0), axis.title.y = element_blank(), axis.title.x = element_blank(),
        axis.text.y = element_text(size = labelSize), legend.position = "none", strip.placement = "outside") +
        scale_x_continuous(expand = c(0, 0))
    return(p1)
}

pcaPlot <- function(path = path, analysis = analysis, vcf = vcfi) {
    gen <- vcfR2genind(vcfi)
    loci <- locNames(gen)

    x <- tab(gen, NA.method = "mean")
    pca = dudi.pca(x, center = TRUE, scannf = FALSE, scale = FALSE, nf = 3)

    percent = pca$eig/sum(pca$eig) * 100

    PCAdt <- data.frame(pca$li) %>%
        rownames_to_column("Ind") %>%
        left_join(filter(dt, Ind %in% indNames(gen)), by = "Ind") %>%
        mutate(Ind_h = paste0(Ind, substring(hybrid, 1, 1)))

    PCAdt <- PCAdt %>%
        group_by(species) %>%
        dplyr::summarise(across(Axis1:Axis3, mean, na.rm = TRUE)) %>%
        full_join(PCAdt, by = "species", suffix = c(".cen", ""))

    # Custom x and y labels
    axis1Lab = paste("Axis 1 (", format(round(percent[1], 1), nsmall = 1), " %)",
        sep = "")
    axis2Lab = paste("Axis 2 (", format(round(percent[2], 1), nsmall = 1), " %)",
        sep = "")

    ggtheme <- function(axisA, axisB, axisAlab, axisBlab) {
        ggplot(data = PCAdt, aes_string(x = axisA, y = axisB)) + labs(x = axisAlab,
            y = axisBlab) + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) +
            geom_point(aes(color = datatype_nuc, shape = caste, fill = lineage),
                size = 3, show.legend = TRUE, alpha = 0.85) + geom_text_repel(aes(label = Ind_h),
            max.overlaps = 15, size = 3, show.legend = FALSE, segment.size = 0.3) +
            scale_shape_manual(values = c(21, 24)) + scale_fill_manual(values = linCol[unique(PCAdt$lineage)]) +
            scale_colour_manual(values = c("black", "white")) + theme(axis.text.y = element_text(colour = "black",
            size = 12), axis.text.x = element_text(colour = "black", size = 12),
            axis.title = element_text(colour = "black", size = 12), panel.border = element_rect(colour = "black",
                fill = NA, size = 1), panel.background = element_blank(), plot.title = element_text(hjust = 0.5,
                size = 15), legend.position = "bottom") + labs(fill = "", color = "",
            shape = "") + guides(shape = guide_legend(nrow = 2), colour = guide_legend(nrow = 2),
            fill = guide_legend(nrow = 2, override.aes = list(shape = 21, alpha = 1)))
    }

    # Scatter plot axis 1 vs. 2
    PCAplot <- ggtheme("Axis1", "Axis2", axis1Lab, axis2Lab)

    return(annotate_figure(PCAplot, bottom = text_grob(paste(nLoc(gen), "SNPs used"),
        hjust = 1, x = 1, size = 10)))
}

Phylogenetic analyses across Messor

Exploration with dated trees

COI analyses

Analyses done with COI including NCBI genbank sequences with other species. This COI analysis was initially done to attempt to get a rough timing of splits across the history of this group of species. In Ward et al. 2015 chronogram, there seems to be a suggestion for the split age between two species for which there is COI sequences and were present in that study: M. wasmani and M. denticornis. So it was used a secondary calibration of 6Mya for the split between those species and allowing for this event to range from 4.5 to 7.5 Mya, with an alignment that includes the genbank available sequences and the COI sequences for a subset of samples in this project data set. M. aciculatus comes out as an outgroup but the node of the three clades of interest is different using a coalescent approach like beast vs raxml. It’s unclear if the outer group of the three clades is M. structor clade (RAxML) or the M. ebeninus clade (Beast), since in both cases that node has low support (< 0.8 posterior for Beast and < 50 bootstrap for RAxML).


tree <- read.raxml("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/alignments/messorNCBI/runBeast2_qslClean/RAxML_bipartitionsBranchLabels.NCBIseq_qslNoExcl_COI_RAxMLOut")

tree2 <- root(tree, which(tree@phylo$tip.label %in% c("EF518450_pergandei", "DQ074362_chamberlini",
    "DQ353349_julianus")))

rr <- ggtree(tree2, ladderize = TRUE, right = TRUE)
r2 <- rr + geom_tiplab(hjust = 0, size = 3, show.legend = F) + xlim(0, max(rr$data$x) +
    0.08) + geom_treescale(x = 0, y = 0) + geom_label2(aes(label = bootstrap, subset = as.numeric(sub("/.*",
    "", bootstrap)) > 50), size = 3, nudge_x = 0, label.padding = unit(0.1, "lines")) +
    labs(title = "RAxML tree")

beastMCC <- read.beast("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/alignments/messorNCBI/runBeast2_qslClean/NCBIseq_qslNoExcl_COI_MCC.tree")
beastMCC2 <- beastMCC
beastMCC2@phylo <- reorder(phangorn::midpoint(beastMCC2@phylo))

bb <- revts(ggtree(beastMCC2, ladderize = TRUE, right = TRUE))
b2 <- bb + geom_tiplab(aes(label = label), size = 3) + geom_range("height_0.95_HPD",
    color = "blue", size = 1, alpha = 0.3) + geom_label2(aes(x = branch, label = round(as.numeric(posterior),
    2)), size = 3, nudge_y = 0.5, label.padding = unit(0.1, "lines")) + scale_x_continuous(labels = abs,
    breaks = -(0:15), limits = c(min(bb$data$x), 1.4)) + theme_tree2() + labs(title = "MCC tree from Beast analysis")
b2 + r2 + plot_layout(widths = c(1, 0.7))

# ,subset=posterior > 0.8

Observing the distribution of posterior trees also suggest some difficulty in sorting the position of Messor aciculatus and the inconsistency in sorting the position of the three clades of interest:

Densitree figure of the Beast analysis



SNP-based analyses

These issues can also be visualized in some explorations using SNAPP, which is a Beast-like analysis applying the Multispecies Coalescent but using SNPs instead of sequences. For this it was applied a few very rough calibration points from the COI analysis and used SNPs thinned from a vcf file that were subset to 2000 SNPs with at least 500bp intervals, following this tutorial: https://github.com/mmatschiner/tutorials/blob/master/species_tree_inference_with_snp_data/README.md

For this subsetting I just considered queen RNA-seq samples to avoid the bias of having different data sets, so no samples from structor4 and mcarthuri7 were included.


module load bcftools vcftools beast2
source /shared/ifbstor1/software/miniconda/etc/profile.d/conda.sh
conda activate ruby

bcftools merge -l qsl2_vcfList -m all --threads 24 -Ou -0 | bcftools view --threads 24 -e 'AC==0 || AC==AN || F_MISSING > 0.2' -m2 -M2 -Oz -o qsl2_MaciRef_RNAseq_vcf_filtered.vcf.gz 

vcftools --gzvcf qsl2_MaciRef_RNAseq_vcf_filtered.vcf.gz  --recode --thin 100 --out qsl2_MaciRef_RNAseq_vcf_filteredThin  
mv qsl2_MaciRef_RNAseq_vcf_filteredThin.recode.vcf qsl2_MaciRef_RNAseq_vcf_filteredThin.vcf

ruby ../../snapp_prep.rb -v qsl2_MaciRef_RNAseq_vcf_filteredThin.vcf -t qsl2_popID.txt -c constraintsSP.txt -q 500 -m 2000 -l 1000000 -s qsl2_busco_concatenatedUPGMA_sp.tree -o qsl2_sp #using a quick upgma tree from concatenated busco loci as a starting tree, could be the COI tree too as long as using a starting tree where the constraints are possible. The busco based tree was used because of uncertainties due to mito-nuclear discordances so using a nuclear based tree would be potentially less biased.

wait
beast -threads 24 snapp.xml 
beastMCC <- read.beast("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/qsl2/qsl2_VCF_SNAPPsp/qsl2_sp_MCC.tree")

p <- revts(ggtree(beastMCC, ladderize = TRUE))
p + geom_tiplab(aes(label = label), size = 4) + geom_range("height_0.95_HPD", color = "blue",
    size = 1, alpha = 0.3) + geom_nodelab(aes(x = branch, label = round(posterior,
    2), subset = posterior > 0.8), size = 4, size = 1.8, nudge_y = 0.2) + scale_x_continuous(labels = abs,
    breaks = -(0:15)) + coord_cartesian(clip = "off") + theme_tree2(plot.margin = margin(6,
    40, 6, 6))

Densitree figure of the SNAPP analysis


The dating constraints were avoiding constraining the position of the outgroup and of the three different clades, so in this analysis M. aciculatus doesn’t position itself as an outgroup and there is some noise in the nearby clade.

ggtree is acting weird so I can’t really reroot the SNAPP tree without losing the posterior probabilities, anyway with ape plotting we can see forcing M. aciculatus as outgroup makes the tree a bit weird.

nex <- read.nexus("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/qsl2/qsl2_VCF_SNAPPsp/qsl2_sp_MCC.tree")
tree2 <- root(nex, "aciculatus")

ggtree(tree2) + geom_tiplab() + xlim(0, 11)


If we constrain SNAPP with the ingroup as monophyletic, i.e M. aciculatus is the outgroup, then we get a very short node and the M. ebeninus clade as the outer clade.

beastMCC <- read.beast("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/qsl2/qsl2_VCF_SNAPPsp-AciMono/qsl2_sp_AciMonoCombinedMCC.tree")

p <- revts(ggtree(beastMCC, ladderize = TRUE))
p + geom_tiplab(aes(label = label), size = 4) + geom_range("height_0.95_HPD", color = "blue",
    size = 1, alpha = 0.3) + geom_nodelab(aes(x = branch, label = round(posterior,
    2), subset = posterior > 0.8), size = 4, size = 1.8, nudge_y = 0.2) + scale_x_continuous(labels = abs,
    breaks = -(0:15)) + coord_cartesian(clip = "off") + theme_tree2(plot.margin = margin(6,
    40, 6, 6))


The densiTree plot is even more confusing to interpret:

Densitree figure of the SNAPP analysis when forcing M. aciculatus as an outgroup


Additionally, I also repeated the first analysis where each sample correspond to a “species”. It’s kind of interesting because it shows some of the patterns that can be identified with the population genetics methods, like the lack of structure between the supposedly M. ebeninus lineages and how the two individuals from this Mebe3 lineage (called after looking at the mitochondrial placement of these samples) seem to place differently with one closer to the M.ebeninus samples and the other with M. wasmanni.

beastMCC <- read.beast("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/qsl2/qsl2_VCF_SNAPPsample/qsl2_sampleMCC.tree")

p <- revts(ggtree(beastMCC, ladderize = TRUE))
p + geom_tiplab(aes(label = label), size = 4) + geom_range("height_0.95_HPD", color = "blue",
    size = 1, alpha = 0.3) + geom_nodelab(aes(x = branch, label = round(posterior,
    2), subset = posterior > 0.8), size = 4, size = 1.8, nudge_y = 0.2) + scale_x_continuous(labels = abs,
    breaks = -(0:10), limits = c(-10, 2.5)) + theme_tree2()

Densitree figure of the SNAPP analysis per sample


The SNP-based phylogenetic analysis seems to also support that M.ebeninus clade as the outer clade like the beast analysis for the COI samples, but this uncertainty is unclear if it’s only due to the issue with the outgroup or if this issue is worsen by possible introgression/ILS signal.



Analyses with all samples

Presenting the phylogenetic trees done with all the samples for both mitochondrial data and busco loci with consensus calls in order to include the males and hybrids. These analyses were performed with iq-tree to be able to infer concordance factors and use these to help understand discordances. The interpretation is explained here http://www.robertlanfear.com/blog/files/archive-2018.html, but the general idea is that the support values being shown correspond to bootstrap values similar to what would be estimated using RAxML with a concatenated alignment, the gene concordance factors gCF and the site concordance factors sCF. These concordance factors are estimated for each branch on a tree produced by maximum likelihood on a concatenated alignment. The support values near the tips are less helpful but I didn’t feel like figuring out a way to exclude them so I just filtered out any values where the bootstrap was below 50. The high bootstrap values show how the sampling variance for branches is low however when the gCF is low we either can’t resolve the single locus tree (when branches as very short) and/or there are many loci that have conflicting signal, while if the sCF is relatively similar to the gCF when what causes low gCF are discordances. If the gCF values are affected by lack of information, then gCF values can be a lot lower than sCF values. As an example, a sCF of about 33% shows that there is not overwhelming support for any particular resolution of a given branch.

The branches are colored by species but the tip labels are colored by supposed mitochondrial lineage. As prefix for the tip labels is the hybrid status as calculated by Arthur and the caste is represented by q/w/m as a tip point. The phylogenies are presented with branch lengths to show the divergence levels between clades. The trees were rooted with M. aciculatus because I wasn’t able to figure out how to do mipoint rooting with ggtree so I wouldn’t root the trees an outgroup; just hide the outgroup branching to be easier for visualization.


mitogenome_iqtree vs busco_iqtree

tree <- read.iqtree('/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/all/allMessor_mitogenome_iqtree/concord.cf.tree')
nLoci <- nrow(read.delim('/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/all/allMessor_mitogenome_iqtree/geneTrees.treefile'))
#tree <- treeio::drop.tip(tree, c("SRR4292897","SRR4292898"))

dtTree <- dt %>%
  filter(Ind %in% tree@phylo$tip.label) %>%
  mutate(label2 = paste0(substring(hybrid,1,1),"_", Ind, "_",lineage))

grp <- list()
for (sp in unique(dtTree$species)){
  grp[[sp]] <- dtTree[dtTree$species %in% sp,'Ind']
}

grouped <- groupOTU(root(tree, which(tree@phylo$tip.label %in% c("SRR4292897",
                                                                     "SRR4292898"))),
                      .node= grp)
# tree2 <- tree
# tree2@phylo <-reorder(phangorn::midpoint(tree2@phylo)) ## to use a midpoint root

# grouped <- groupOTU(tree,.node= grp)

p <- ggtree(grouped, #branch.length='none', 
            ladderize=TRUE, right = TRUE)%<+%
  dtTree[,c('Ind','caste','lineage','label2')] 

q1 <- p + 
  geom_tree(aes(color=group)) +
  scale_color_manual(name = "Species",
                     values = spCol[names(spCol) %in% names(grp)]) +
  ggnewscale::new_scale_color() +
  geom_tiplab(size=2, aes(label = label2, color=lineage), 
              hjust = -0.09, aligned = TRUE, show.legend = F) +
  scale_color_manual(values = linCol) +
  guides(color=guide_legend(ncol=1)) +
  ggnewscale::new_scale_color() +
  geom_tippoint(aes(x=x+0.003, shape = caste, color = caste), 
                size = 2, na.rm = TRUE) +
   scale_shape_manual(values = c(utf8ToInt("q"), utf8ToInt("m"), utf8ToInt("w")),
                     breaks = c("queen", "male", "worker")) +
  scale_color_manual(values = c( "#FFAD56" ,"#59A3F7", "#C5EABD"),
                     breaks = c("queen", "male", "worker")) +
  xlim(0.1, max(p$data$x)+0.01) +
  geom_nodelab(aes(subset = as.numeric(sub("/.*", "", label)) > 50), 
               size = 2, hjust = 1, nudge_y = 0.5) +
  labs(title = "Mitogenome",
       caption = paste(nLoci,"loci used for mitochondrial analyses")) +
  theme(legend.position = "bottom") + 
  geom_treescale(x=0.1, y=0) 


tree <- read.iqtree('/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/all/allMessor_busco_iqtree/concord.cf.tree')

nLoci <- nrow(read.delim('/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/all/allMessor_busco_iqtree/geneTrees.treefile'))

# tree <- treeio::drop.tip(tree, c("SRR4292897","SRR4292898"))

dtTree <- dt %>%
  filter(Ind %in% tree@phylo$tip.label) %>%
  mutate(label2 = paste0(substring(hybrid,1,1),"_", Ind, "_",lineage))

grp <- list()
for (sp in unique(dtTree$species)){
  grp[[sp]] <- dtTree[dtTree$species %in% sp,'Ind']
}

grouped <- groupOTU(root(tree, which(tree@phylo$tip.label %in% 
                                       c("SRR4292897","SRR4292898"))),
                    .node= grp)
# tree2 <- tree
# tree2@phylo <-reorder(phangorn::midpoint(tree2@phylo)) ## to use a midpoint root
# grouped <- groupOTU(tree2,.node= grp)

p <- ggtree(grouped, #branch.length='none', 
            ladderize=TRUE, right = TRUE)%<+%
  dtTree[,c('Ind','caste','lineage','label2')] 

q2 <- p + 
  geom_tree(aes(color=group)) +
  scale_color_manual(name = "Species",
                     values = spCol[names(spCol) %in% names(grp)]) +
  ggnewscale::new_scale_color() +
  geom_tiplab(size=2, aes(label = label2, color=lineage), 
              hjust = -0.09, aligned = TRUE, show.legend = F) +
  scale_color_manual(values = linCol) +
  guides(color=guide_legend(ncol=1)) +
  ggnewscale::new_scale_color() +
  geom_tippoint(aes(x=x+0.0002, shape = caste, color = caste), 
                size = 2, na.rm = TRUE) +
   scale_shape_manual(values = c(utf8ToInt("q"), utf8ToInt("m"), utf8ToInt("w")),
                     breaks = c("queen", "male", "worker")) +
  scale_color_manual(values = c( "#FFAD56" ,"#59A3F7", "#C5EABD"),
                     breaks = c("queen", "male", "worker")) +
  xlim(0.013, max(p$data$x)+0.007) +
  geom_nodelab(aes(subset = as.numeric(sub("/.*", "", label)) > 50), 
               size = 2, hjust = 1, nudge_y = 0.5) +
  labs(title = "Nuclear with consensus calls", 
       caption = paste(nLoci,"busco loci used for nuclear analyses")) +
  theme(legend.position = "bottom") + 
  geom_treescale(x=0.013, y=0) 

q1 + q2 + plot_layout(guides = "collect") & theme(legend.position = 'bottom')


General comments from observing and comparing these iq-tree phylogenies:

  • The branching of the main clades likely have ILS given the short nodes and some rather low gCF values pointing at gene tree discordances and/or low quality gene trees from the busco loci.

  • Both mitochondrial and nuclear trees seem to suggest that M. structor clade is the outer clade, which is different from the SNAPP analysis. The node that splits M. aciculatus/M.structorClade/the rest has high values of support accross concordance factors, while the node that splits M.ebeninusClade from M.barbarusClade has lower support for the concordance factors. Need to go dig a bit deeper to see the support for ILS for this node or the effect of the outgroup (however there are no other outgroups that we could use).

  • Something weird is going on with M.ebeninus mitochondrial relationships because using other phylogenetic approaches Mebe1 and Mebe2 do seem to have some clustering but perhaps the low divergence and the presence of hybrids in the tree might lower the support for that topology with iq-tree. With RAxML and a concatenated alignment I31 and I11 samples were named Mebe3 but here there is little support for the existence of three mitochondrial lineages inside M.ebeninus. The nuclear also doesn’t support much intraspecific structuring, however I31 does appear closer to M. wasmanni, which is also supported by other analyses.

  • M. barbarus clade is pretty consistent, but using consensus sequences with the nuclear data set one would expect part of the workers going to either Mbar1 or Mbar2 clades, as it happens for the mitochondrial DNA, however, all workers except 1 are nested with Mbar1 clade. This phylogenetic approach is not appropriate to take this interpretation but I was wondering if this would be suggesting some asymmetry between the lineages.

  • Not relevant for the main questions but the position of M. oertzeni might be inconsistent between markers due to the same issues of outgroup vs ILS.

  • The M. structor clades can be recovered with the mitochondrial iq-tree analysis but the support values seem to suggest some noise (which might be due to the presence of hybrids in the analysis) and Ponticus2 looks nested within Ibericus1. However, in the nuclear phylogeny, it just seems two samples called as Ibericus1 for the mitochondrial RAxML analysis are Ponticus2 and this clade is sister to all queens and males of Ibericus1.

  • Mcarthuri7 seems to be a consistent clade between mtDNA and nuclear, but the position changes, being an outer clade within M. structor lineages for the mtDNA but inside a clade that is sister to the muticus6/structor3/structor4 clade. These nodes have very low support for concordance factors for the nuclear but they are not that high for the mtDNA. Additionaly, this clade (that consists of all hybrids and the queen) is sister to a clade several hybrids of ibericus1 and one ponticus2 worker.

  • Muticus6 splits into two clades, one that includes the queen and is sister to structor3/4 clade for both markers, and the other that corresponds to only workers and outside of the muticus6/structor3/4 clade. Although with low support, this weird clade seems to be closely related to one queen of structor3 that is not closely related to the other structor3 queens.

  • Like for Mebe1 and Mebe2 is not clear from phylogenetic analyses that there is much divergence between a potential structor 3 and 4 clade. There is a male with mitochondrial of ibericus closely related to this structor3/4 clade. Need to check if it comes from any colony from other queens or workers.

  • I think perhaps ponticus2 and mcarthuri7 might not do social hybridogenesis, or at least not in the same way, since queens and hybrids (except one hybrid of ponticus2) seem to stay in the same clade, similarly how other species without social hybridogenesis. Nonetheless, I think these discordances and position of some branches might point to admixture between mcarthuri7 and ibericus1, ibercus1 and ponticus2, structor3 with muticus, and then perhaps something between ibericus and muticus/structor3/4 as well mcarthuri7 and ponticus2. This probably will be more clear when looking to colonies origin and geographic location.



Analyses with just Queens

The phylogenetic analyses with all samples are a bit hard to interpret because of the presence of males and workers so I included several comparisons of just using all available queens. The comparison between genetic markers (mtDNA vs nuclear), type of method (iq-tree vs concatenation with RAxML) and variant calling (consensus calls vs phased). Overall interpretation of patterns seem to be similar, but I show trees with support values more clearly.


mitogenome_iqtree vs busco_iqtree

tree <- read.iqtree("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/all/messorQueens_mitogenome_iqtree/concord.cf.tree")

dtTree <- dt %>%
    filter(Ind %in% tree@phylo$tip.label) %>%
    mutate(label2 = paste0(substring(caste, 1, 1), "_", Ind, "_", lineage))

grp <- list()
for (sp in unique(dtTree$species)) {
    grp[[sp]] <- dtTree[dtTree$species %in% sp, "Ind"]
}

groupedIQ <- groupOTU(root(tree, which(tree@phylo$tip.label %in% "SRR4292897")),
    .node = grp)

p <- ggtree(groupedIQ, branch.length = "none", ladderize = TRUE, right = TRUE) %<+%
    dtTree[, c("Ind", "caste", "lineage", "label2")]

q1 <- p + geom_tree(aes(color = group)) + scale_color_manual(name = "Species", values = spCol[names(spCol) %in%
    names(grp)]) + ggnewscale::new_scale_color() + geom_tiplab(size = 3, aes(label = label2,
    color = lineage), hjust = 0, aligned = TRUE, show.legend = F) + scale_color_manual(values = linCol) +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  guides(color
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  =
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  guide_legend(ncol
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  =
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  1))
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  xlim(0,
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  max(p$data$x)
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  6)
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  #geom_nodelab(aes(subset
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  =
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  as.numeric(sub('/.*',
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  '',
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  label))
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  >
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  50),
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  size
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  =
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  3)
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  +
geom_label2(aes(label = label, subset = as.numeric(sub("/.*", "", label)) > 50),
    size = 3, nudge_x = -0.5, label.padding = unit(0.1, "lines")) + labs(title = "Mitogenome") +
    theme(legend.position = "bottom")

tree <- read.iqtree("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/all/messorQueens_busco_iqtree/concord.cf.tree")

nLoci <- nrow(read.delim("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/all/messorQueens_busco_iqtree/geneTrees.treefile"))
dtTree <- dt %>%
    filter(Ind %in% tree@phylo$tip.label) %>%
    mutate(label2 = paste0(substring(caste, 1, 1), "_", Ind, "_", lineage))

grp <- list()
for (sp in unique(dtTree$species)) {
    grp[[sp]] <- dtTree[dtTree$species %in% sp, "Ind"]
}

groupedIQ <- groupOTU(root(tree, which(tree@phylo$tip.label %in% "SRR4292897")),
    .node = grp)

p <- ggtree(groupedIQ, branch.length = "none", ladderize = TRUE, right = TRUE) %<+%
    dtTree[, c("Ind", "caste", "lineage", "label2")]

q2 <- p + geom_tree(aes(color = group)) + scale_color_manual(name = "Species", values = spCol[names(spCol) %in%
    names(grp)]) + ggnewscale::new_scale_color() + geom_tiplab(size = 3, aes(label = label2,
    color = lineage), hjust = 0, aligned = TRUE, show.legend = F) + scale_color_manual(values = linCol) +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  guides(color
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  =
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  guide_legend(ncol
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  =
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  1))
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  xlim(0,
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  max(p$data$x)
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  6)
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  #geom_nodelab(aes(subset
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  =
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  as.numeric(sub('/.*',
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  '',
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  label))
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  >
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  50),
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  size
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  =
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  3)
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  +
geom_label2(aes(label = label, subset = as.numeric(sub("/.*", "", label)) > 50),
    size = 3, nudge_x = -0.5, label.padding = unit(0.1, "lines")) + labs(title = "Nuclear with consensus calls") +
    theme(legend.position = "bottom")

q1 + q2 + plot_layout(guides = "collect") & theme(legend.position = "bottom")

Using 2640 busco loci for nuclear analyses. Showing only support values for nodes with bootstrap values above 50.



busco_iqtree vs busco_raxmlConcatenated

tree <- read.raxml("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/all/messorQueens_busco_concatenatedRAxML/RAxML_bipartitionsBranchLabels.messorQueens_busco")

dtTree <- dt %>%
    filter(Ind %in% tree@phylo$tip.label) %>%
    mutate(label2 = paste0(substring(caste, 1, 1), "_", Ind, "_", lineage))

grp <- list()
for (sp in unique(dtTree$species)) {
    grp[[sp]] <- dtTree[dtTree$species %in% sp, "Ind"]
}

groupedIQ <- groupOTU(root(tree, which(tree@phylo$tip.label %in% "SRR4292897")),
    .node = grp)

p <- ggtree(groupedIQ, branch.length = "none", ladderize = TRUE, right = TRUE) %<+%
    dtTree[, c("Ind", "caste", "lineage", "label2")]

q3 <- p + geom_tree(aes(color = group)) + scale_color_manual(name = "Species", values = spCol[names(spCol) %in%
    names(grp)]) + ggnewscale::new_scale_color() + geom_tiplab(size = 3, aes(label = label2,
    color = lineage), hjust = 0, aligned = TRUE, show.legend = F) + scale_color_manual(values = linCol) +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 7) + geom_label2(aes(label = bootstrap,
    subset = as.numeric(sub("/.*", "", bootstrap)) > 50), size = 3, nudge_x = 0,
    label.padding = unit(0.1, "lines")) + labs(title = "RAxML tree with concatenated busco loci") +
    theme(legend.position = "bottom")
q2.1 <- q2 + labs(title = "Iq-tree tree")
q2.1 + q3 + plot_layout(guides = "collect") & theme(legend.position = "bottom")

Using 2640 busco loci for nuclear analyses. Showing only support values for nodes with bootstrap values above 50.



busco_iqtree vs phasedH0_iqtree

tree <- read.iqtree("/Users/acas/Dropbox/Post-docs/Roux_Lab/messor_analyses/phylo_analyses/analyses/all/messorQueens_phasedH0_iqtree/concord.cf.tree")

dtTree <- dt %>%
    filter(Ind %in% tree@phylo$tip.label) %>%
    mutate(label2 = paste0(substring(caste, 1, 1), "_", Ind, "_", lineage))

grp <- list()
for (sp in unique(dtTree$species)) {
    grp[[sp]] <- dtTree[dtTree$species %in% sp, "Ind"]
}

groupedIQ <- groupOTU(root(tree, which(tree@phylo$tip.label %in% "SRR4292897")),
    .node = grp)

p <- ggtree(groupedIQ, branch.length = "none", ladderize = TRUE, right = TRUE) %<+%
    dtTree[, c("Ind", "caste", "lineage", "label2")]

q4 <- p + geom_tree(aes(color = group)) + scale_color_manual(name = "Species", values = spCol[names(spCol) %in%
    names(grp)]) + ggnewscale::new_scale_color() + geom_tiplab(size = 3, aes(label = label2,
    color = lineage), hjust = 0, aligned = TRUE, show.legend = F) + scale_color_manual(values = linCol) +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  guides(color
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  =
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  guide_legend(ncol
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  =
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  1))
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  xlim(0,
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  max(p$data$x)
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  6)
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  +
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  #geom_nodelab(aes(subset
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  =
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  as.numeric(sub('/.*',
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  '',
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  label))
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  >
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  50),
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  size
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  =
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  3)
    guides(color = guide_legend(ncol = 1)) + xlim(0, max(p$data$x) + 6) + #geom_nodelab(aes(subset = as.numeric(sub('/.*', '', label)) > 50), size = 3) +  +
geom_label2(aes(label = label, subset = as.numeric(sub("/.*", "", label)) > 50),
    size = 3, nudge_x = -0.5, label.padding = unit(0.1, "lines")) + labs(title = "Tree with phased calls and 1 haplotype") +
    theme(legend.position = "bottom")

q2.1 <- q2 + labs(title = "Tree with consensus calls")
q2.1 + q4 + plot_layout(guides = "collect") & theme(legend.position = "bottom")

Using 2640 busco loci for nuclear analyses. Showing only support values for nodes with bootstrap values above 50.




Clustering Per clade

Admixture analyses were run with per analysis VCF files obtained from merging per sample VCF files. This was done to decrease time of bcftools mpileup per dataset, but to improve overlap of sites across RNAseq and WGS samples a .bed file was created with a set of SNPs including a little over 1 million sites.

The .bed file was it self created from the cleaning of a qsl2 vcf (subset of RNAseq queen samples analysed with SNAPP). The filtering of this vcf file retrieved biallelic sites (indels excluded) with Qual above 20, read depth above 10 and allowing 10% missing data across sites.

Each sample VCF was called with keeping all sites, filtering (’FORMAT/DP>10 && QUAL>20) and then subsetting SNP sites from created .bed file. The final merged VCF file was then filtered for biallelic sites thinned every 500bp and allowing up to 25% missing data per site.

Admixture analyses were run aided by the admixturePipeline.py (https://github.com/stevemussmann/admixturePipeline) and evaluated with CLUMPAK.

Analyses were run for seven subclades datasets that include queens and workers. These datasets focus on the three social hybridogenesis groups of species: Mbar only and Mbar with close relatives; Mebe only and Mebe with close relatives; and M. structor as well as M. structor lineages split between the two main clades: Ibericus + ponticus + mcarthuri (Mstr127) and muticus + structor3 + structor4 (Mstr346). Still unsure if I shouldn’t also have analysed an Mstr3467 clade given the uncertainty of the position for this clade.

To be improved showing the admixture results with the bars aligned to different phylogenetic trees.



M. barbarus system

Overall results support the two Mbar clusters as expected with the workers showing admixed proportions between the two lineages. CLUMPAK analysis also support the best K for Mbar is two.


M.barbarus + Decipiens + Capitatus

analysis <- "MbarDecCap"

admixturePlot(path, analysis, 8)

vcfi <- read.vcfR(list.files(paste0(path, "/", analysis, "/"), pattern = "_filteredThin.recode.vcf",
    recursive = F, full.names = T), verbose = FALSE)
pcaPlot(path, analysis, vcfi)



M.barbarus

analysis <- "Mbar"

admixturePlot(path, analysis, 8)

vcfi <- read.vcfR(list.files(paste0(path, "/", analysis, "/"), pattern = "_filteredThin.recode.vcf",
    recursive = F, full.names = T), verbose = FALSE)
pcaPlot(path, analysis, vcfi)



M.ebeninus system

Included the genome assembly of M. aegyptiacus (Maeg) for the Mebe and close relatives analysis. This sample seems to cluster with Mebe3 in Admixture but maybe closer to the workers than I31. Regardless it seems more likely that due to the lack of clustering between Mebe1 and Mebe2, also supported with best K being 1 and not 2 for the Mebe analysis, that Mebe3 lineage would have a role in the social hybridogenesis system, since the workers seem to be between Mebe12 and Mebe3, but need to understand the colonies and geographic locations of the samples.


M.ebeninus + Wasmani + Minor

analysis <- "MebeWesMin"

admixturePlot(path, analysis, 8)

vcfi <- read.vcfR(list.files(paste0(path, "/", analysis, "/"), pattern = "_filteredThin.recode.vcf",
    recursive = F, full.names = T), verbose = FALSE)
pcaPlot(path, analysis, vcfi)



M.ebeninus

analysis <- "Mebe"

admixturePlot(path, analysis, 8)

vcfi <- read.vcfR(list.files(paste0(path, "/", analysis, "/"), pattern = "_filteredThin.recode.vcf",
    recursive = F, full.names = T), verbose = FALSE)
pcaPlot(path, analysis, vcfi)



M.structor system

mcarthuri7 doesn’t seem to be admixed with any of the lineages. There seems to be some suggestion for admixture between Structor3/4 and ponticus2 as well as with muticus6. The two samples with mtDNA of ibericus2 but ponticus2 in nuclear phylogenies seem to be admixed between both lineages. The best K doesn’t favor workers to have admixed proportions betwen both.


All M.structor

analysis <- "Mstr"

admixturePlot(path, analysis, 8)

vcfi <- read.vcfR(list.files(paste0(path, "/", analysis, "/"), pattern = "_filteredThin.recode.vcf",
    recursive = F, full.names = T), verbose = FALSE)
pcaPlot(path, analysis, vcfi)



Mstr127 - Ibericus + Ponticus + Mcarthuri

Still not clear to me which lineages are the parental lineages of ibericus workers, if ponticus2 or mcarthuri7.

analysis <- "Mstr127"

admixturePlot(path, analysis, 8)

vcfi <- read.vcfR(list.files(paste0(path, "/", analysis, "/"), pattern = "_filteredThin.recode.vcf",
    recursive = F, full.names = T), verbose = FALSE)
pcaPlot(path, analysis, vcfi)


Ibericus1 vs ponticus2

gen <- vcfR2genind(vcfi)
loci <- locNames(gen)
ind <- indNames(gen)

dtset <- dplyr::filter(dt, Ind %in% ind, lineage %in% c("ibericus1", "ponticus2"))
gen2 <- gen[which(ind %in% dtset$Ind)]

x <- tab(gen2, NA.method = "mean")
pca = dudi.pca(x, center = TRUE, scannf = FALSE, scale = FALSE, nf = 3)

percent = pca$eig/sum(pca$eig) * 100

PCAdt <- data.frame(pca$li) %>%
    rownames_to_column("Ind") %>%
    left_join(filter(dt, Ind %in% indNames(gen)), by = "Ind") %>%
    mutate(Ind_h = paste0(Ind, substring(hybrid, 1, 1)))

PCAdt <- PCAdt %>%
    group_by(species) %>%
    dplyr::summarise(across(Axis1:Axis3, mean, na.rm = TRUE)) %>%
    full_join(PCAdt, by = "species", suffix = c(".cen", ""))

# Custom x and y labels
axis1Lab = paste("Axis 1 (", format(round(percent[1], 1), nsmall = 1), " %)", sep = "")
axis2Lab = paste("Axis 2 (", format(round(percent[2], 1), nsmall = 1), " %)", sep = "")

ggtheme <- function(axisA, axisB, axisAlab, axisBlab) {
    ggplot(data = PCAdt, aes_string(x = axisA, y = axisB)) + labs(x = axisAlab, y = axisBlab) +
        geom_hline(yintercept = 0) + geom_vline(xintercept = 0) + geom_point(aes(color = datatype_nuc,
        shape = caste, fill = lineage), size = 3, show.legend = TRUE, alpha = 0.85) +
        geom_text_repel(aes(label = Ind_h), max.overlaps = 15, size = 3, show.legend = FALSE,
            segment.size = 0.3) + scale_shape_manual(values = c(21, 24)) + scale_fill_manual(values = linCol[unique(PCAdt$lineage)]) +
        scale_colour_manual(values = c("black", "white")) + theme(axis.text.y = element_text(colour = "black",
        size = 12), axis.text.x = element_text(colour = "black", size = 12), axis.title = element_text(colour = "black",
        size = 12), panel.border = element_rect(colour = "black", fill = NA, size = 1),
        panel.background = element_blank(), plot.title = element_text(hjust = 0.5,
            size = 15), legend.position = "bottom") + labs(fill = "", color = "",
        shape = "") + guides(shape = guide_legend(nrow = 2), colour = guide_legend(nrow = 2),
        fill = guide_legend(nrow = 2, override.aes = list(shape = 21, alpha = 1)))
}

# Scatter plot axis 1 vs. 2
PCAplot <- ggtheme("Axis1", "Axis2", axis1Lab, axis2Lab)

PCAplot


Ibericus1 vs mcarthuri7

gen <- vcfR2genind(vcfi)
loci <- locNames(gen)
ind <- indNames(gen)

dtset <- dplyr::filter(dt, Ind %in% ind, lineage %in% c("ibericus1", "mcarthuri7"))
gen2 <- gen[which(ind %in% dtset$Ind)]

x <- tab(gen2, NA.method = "mean")
pca = dudi.pca(x, center = TRUE, scannf = FALSE, scale = FALSE, nf = 3)

percent = pca$eig/sum(pca$eig) * 100

PCAdt <- data.frame(pca$li) %>%
    rownames_to_column("Ind") %>%
    left_join(filter(dt, Ind %in% indNames(gen)), by = "Ind") %>%
    mutate(Ind_h = paste0(Ind, substring(hybrid, 1, 1)))

PCAdt <- PCAdt %>%
    group_by(species) %>%
    dplyr::summarise(across(Axis1:Axis3, mean, na.rm = TRUE)) %>%
    full_join(PCAdt, by = "species", suffix = c(".cen", ""))

# Custom x and y labels
axis1Lab = paste("Axis 1 (", format(round(percent[1], 1), nsmall = 1), " %)", sep = "")
axis2Lab = paste("Axis 2 (", format(round(percent[2], 1), nsmall = 1), " %)", sep = "")

ggtheme <- function(axisA, axisB, axisAlab, axisBlab) {
    ggplot(data = PCAdt, aes_string(x = axisA, y = axisB)) + labs(x = axisAlab, y = axisBlab) +
        geom_hline(yintercept = 0) + geom_vline(xintercept = 0) + geom_point(aes(color = datatype_nuc,
        shape = caste, fill = lineage), size = 3, show.legend = TRUE, alpha = 0.85) +
        geom_text_repel(aes(label = Ind_h), max.overlaps = 15, size = 3, show.legend = FALSE,
            segment.size = 0.3) + scale_shape_manual(values = c(21, 24)) + scale_fill_manual(values = linCol[unique(PCAdt$lineage)]) +
        scale_colour_manual(values = c("black", "white")) + theme(axis.text.y = element_text(colour = "black",
        size = 12), axis.text.x = element_text(colour = "black", size = 12), axis.title = element_text(colour = "black",
        size = 12), panel.border = element_rect(colour = "black", fill = NA, size = 1),
        panel.background = element_blank(), plot.title = element_text(hjust = 0.5,
            size = 15), legend.position = "bottom") + labs(fill = "", color = "",
        shape = "") + guides(shape = guide_legend(nrow = 2), colour = guide_legend(nrow = 2),
        fill = guide_legend(nrow = 2, override.aes = list(shape = 21, alpha = 1)))
}

# Scatter plot axis 1 vs. 2
PCAplot <- ggtheme("Axis1", "Axis2", axis1Lab, axis2Lab)

PCAplot


Only Ibericus1 queens without funny ponticus2-like samples

gen <- vcfR2genind(vcfi)
loci <- locNames(gen)
ind <- indNames(gen)

dtset <- dplyr::filter(dt, Ind %in% ind, lineage %in% "ibericus1", caste %in% "queen",
    !Ind %in% c("RPODGQ", "RPODGW"))
gen2 <- gen[which(ind %in% dtset$Ind)]

x <- tab(gen2, NA.method = "mean")
pca = dudi.pca(x, center = TRUE, scannf = FALSE, scale = FALSE, nf = 3)

percent = pca$eig/sum(pca$eig) * 100

PCAdt <- data.frame(pca$li) %>%
    rownames_to_column("Ind") %>%
    left_join(filter(dt, Ind %in% indNames(gen)), by = "Ind") %>%
    mutate(Ind_h = paste0(Ind, substring(hybrid, 1, 1)))

PCAdt <- PCAdt %>%
    group_by(species) %>%
    dplyr::summarise(across(Axis1:Axis3, mean, na.rm = TRUE)) %>%
    full_join(PCAdt, by = "species", suffix = c(".cen", ""))

# Custom x and y labels
axis1Lab = paste("Axis 1 (", format(round(percent[1], 1), nsmall = 1), " %)", sep = "")
axis2Lab = paste("Axis 2 (", format(round(percent[2], 1), nsmall = 1), " %)", sep = "")

ggtheme <- function(axisA, axisB, axisAlab, axisBlab) {
    ggplot(data = PCAdt, aes_string(x = axisA, y = axisB)) + labs(x = axisAlab, y = axisBlab) +
        geom_hline(yintercept = 0) + geom_vline(xintercept = 0) + geom_point(aes(color = datatype_nuc,
        shape = caste, fill = lineage), size = 3, show.legend = TRUE, alpha = 0.85) +
        geom_text_repel(aes(label = Ind_h), max.overlaps = 15, size = 3, show.legend = FALSE,
            segment.size = 0.3) + scale_shape_manual(values = c(21, 24)) + scale_fill_manual(values = linCol[unique(PCAdt$lineage)]) +
        scale_colour_manual(values = c("black", "white")) + theme(axis.text.y = element_text(colour = "black",
        size = 12), axis.text.x = element_text(colour = "black", size = 12), axis.title = element_text(colour = "black",
        size = 12), panel.border = element_rect(colour = "black", fill = NA, size = 1),
        panel.background = element_blank(), plot.title = element_text(hjust = 0.5,
            size = 15), legend.position = "bottom") + labs(fill = "", color = "",
        shape = "") + guides(shape = guide_legend(nrow = 2), colour = guide_legend(nrow = 2),
        fill = guide_legend(nrow = 2, override.aes = list(shape = 21, alpha = 1)))
}

# Scatter plot axis 1 vs. 2
PCAplot <- ggtheme("Axis1", "Axis2", axis1Lab, axis2Lab)

PCAplot



Mstr346 - Structor3 + Structor4 + Muticus

The weird queen of structor3 is closer to the workers of structor 3 and these form a different group to the structor4 queens and workers and the other two structor3. The weird queen group is perhaps as distant from muticus6 as from the structor34 group. The two muticus2 samples below 0 are the two with any suggestion of admixture with structor34. Need to repeat Admixture just for structor3 and 4 lineages to see if the clustering as in the PCA is better supported than no clustering.


analysis <- "Mstr346"

admixturePlot(path, analysis, 8)

vcfi <- read.vcfR(list.files(paste0(path, "/", analysis, "/"), pattern = "_filteredThin.recode.vcf",
    recursive = F, full.names = T), verbose = FALSE)
pcaPlot(path, analysis, vcfi)